#### Wordfish Analysis ####

# Set your working directory to the root of the replication repo, i.e. the
# directory that contains the Data subdirectory (the load() calls below use
# paths relative to it, e.g. "./Data/UK_Manifestos.RData"). For us, this looks
# like:
setwd("~/Desktop/Replication_Materials")

# Install preText only if it is not already available, so that re-running the
# script does not reinstall the package every time:
if (!requireNamespace("preText", quietly = TRUE)) {
    install.packages("preText")
}

# Load the packages:
library(preText)
library(ggplot2)

# Load the data. load() restores the objects saved in the .RData file into the
# current environment under the names they were saved with.
# NOTE(review): presumably this is where `documents` (passed to
# factorial_preprocessing() below) comes from -- confirm.
load("./Data/UK_Manifestos.RData")

# Load in labels for plotting.
# NOTE(review): presumably this defines `labels`, which is passed to the
# plotting function at the end of the script -- confirm.
load("./Data/128_Combination_Preprocessing_Labels.RData")

# Run the factorial preprocessing over the manifestos. The infrequent-term
# threshold is raised to 0.02 here because the corpus only has 69 documents and
# terms that do not appear in at least two of them should be excluded:
uk_manifestos_fact_prep <- factorial_preprocessing(
    documents,
    infrequent_term_threshold = 0.02
)


# Fix document names by removing the .txt ending.
#
# remove_txt() drops the first "." and everything after it from each element
# of `str`, e.g. "Con1983.txt" -> "Con1983". Elements without a "." are
# returned unchanged. The function is vectorised (one result per input
# element, including zero-length input) and uses base R only.
#
# Args:
#   str: character vector of document names (coerced with as.character()).
# Returns:
#   An unnamed character vector of the same length as `str`.
remove_txt <- function(str) {
    sub("\\..*$", "", as.character(str))
}
# Apply remove_txt() to the document names of every dfm in dfm_list.
# Iterating with seq_along() instead of a hard-coded 1:128 keeps the loop
# correct if the number of preprocessing combinations differs from 128.
for (i in seq_along(uk_manifestos_fact_prep$dfm_list)) {
    # vapply() enforces exactly one cleaned name (a single string) per
    # document and returns a plain, unnamed character vector.
    uk_manifestos_fact_prep$dfm_list[[i]]@Dimnames$docs <- vapply(
        uk_manifestos_fact_prep$dfm_list[[i]]@Dimnames$docs,
        remove_txt,
        character(1),
        USE.NAMES = FALSE)
}

# Get the years each document was written and store them as a numeric vector.
# The document names are read from the row names of the first dfm in the list.
# NOTE(review): this assumes every dfm in dfm_list contains the same documents
# in the same order -- confirm.
dfm <- uk_manifestos_fact_prep$dfm_list[[1]]
# rl() deletes every run of ASCII letters and "." characters from each element
# of `str` and returns what is left; for a document name such as "Con1983" that
# is "1983". Characters other than letters and "." (digits, underscores, ...)
# are kept as they are. Implemented with base R gsub() so that no additional
# package is required; perl = TRUE makes the "\\." inside the character class
# an unambiguous escaped dot.
#
# Args:
#   str: character vector (coerced with as.character()).
# Returns:
#   An unnamed character vector of the same length as `str`.
rl <- function(str) {
    gsub("[A-Za-z\\.]+", "", as.character(str), perl = TRUE)
}
# Run rl() on each document name in turn and convert what remains to numbers:
years <- as.numeric(
    unlist(lapply(rownames(dfm), rl), use.names = FALSE)
)

# Use the wordfish_comparison function to compare all dfms. We are using
# Conservative and Labour manifestos from 1983, 1987, 1992, and 1997 for a total
# of 8 manifestos. These are selected by the document_inidices = c(42:45,19:22)
# argument (note the spelling "inidices" used in the call below; keep it as is
# unless the function's signature says otherwise). You can see the document
# names by entering rownames(dfm) into the console. We need to set the anchors
# to c(1,5) because anchoring is applied in the reduced dfm: positions 1 and 5
# of c(42:45,19:22) correspond to documents 42 and 19 of the full dfm. We are
# also only including terms that appear at least once in a manifesto from each
# of the 4 years (presumably what proportion_threshold = 1 requests), to deal
# with the strong temporal effects.
wordfish_results <- wordfish_comparison(
    uk_manifestos_fact_prep$dfm_list,
    years,
    anchors = c(1,5),
    proportion_threshold = 1,
    document_inidices = c(42:45,19:22))

# Our a priori ranking of the eight documents:
ranking <- c(
    "Lab1983", "Lab1987", "Lab1992", "Lab1997",
    "Con1992", "Con1997", "Con1987", "Con1983"
)

# Source a variant of the wordfish_rank_plot function from the preText R
# package; this variant orders the output by how similar it is to the a priori
# ranking. The path is relative to the working directory:
source("wordfish_rank_plot_apriori_ordering.R")

# Generate the plot and write it to Wordfish_Rank_UK.pdf in the working
# directory (note that this plot will have rows ordered by number of
# "incorrect" rankings, as in paper). The a priori ordering is taken from the
# `ranking` vector defined above instead of being repeated as a literal, so
# there is a single place to edit it.
pdf(file = "Wordfish_Rank_UK.pdf", height = 10, width = 5)
wordfish_rank_plot_apriori_ordering(wordfish_results,
                                    labels,
                                    invert = FALSE,
                                    ranking = ranking,
                                    black_white = TRUE,
                                    one_matrix = TRUE)
dev.off()


